# Essential Imports
import numpy as np
import pandas as pd
import os
import glob
import matplotlib.pyplot as plt
import seaborn as sns
# Models
# NOTE(review): '!pip install ...' is IPython/Jupyter shell-magic syntax and is a
# SyntaxError in a plain Python file; keep install commands commented out
# (matching the style of the vecstack line) and run them in a terminal instead.
#!pip install vecstack
#!pip install numpy
from vecstack import stacking
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# Others
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from datetime import datetime
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import recall_score, roc_auc_score, brier_score_loss, roc_curve, f1_score, classification_report, confusion_matrix
#from sklearn.inspection import permutation_importance
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
import time
import warnings
warnings.filterwarnings("ignore")
# Model Explainer
import shap
import matplotlib.pyplot as plt  # NOTE(review): duplicate of the import above; kept for notebook-cell independence
Requirement already satisfied: numpy in c:\users\gavin ng\anaconda3\lib\site-packages (1.20.3)
start_time = datetime.now()
import os  # NOTE(review): already imported at the top; kept so this cell also runs standalone
path = os.getcwd()  # working directory containing the features/ and labels/ folders
# BUG FIX: pass each path component to os.path.join separately.  The original
# embedded a Windows backslash ("features\\part-*.csv"), which defeats the very
# OS-independence the inline comment asked for.
all_files = glob.iglob(os.path.join(path, "features", "part-*.csv"))
# Lazily read each part file, then concatenate into one telematics frame.
df_from_each_file = (pd.read_csv(f) for f in all_files)
concatenated_df = pd.concat(df_from_each_file, ignore_index=True)
train = concatenated_df
# Same portability fix for the labels file: "labels\part-..." only worked on
# Windows, and "\p" is additionally an invalid escape sequence in a normal
# string literal (DeprecationWarning on modern Python).
labels = pd.read_csv(os.path.join(path, "labels", "part-00000-e9445087-aa0a-433b-a7f6-7f4c19d78ad6-c000.csv"))
# Take a look of how our sample looks like
np.random.seed(555)
train.sample(5)
| bookingID | Accuracy | Bearing | acceleration_x | acceleration_y | acceleration_z | gyro_x | gyro_y | gyro_z | second | Speed | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 4183140 | 206158430355 | 3.000 | 218.0 | -0.502798 | 9.490906 | -1.613741 | -0.018849 | 0.021240 | -0.021507 | 640.0 | 0.000000 |
| 2894542 | 26 | 4.607 | 310.0 | -0.354353 | 9.658504 | -0.794899 | 0.023761 | 0.001567 | 0.014855 | 607.0 | 0.000000 |
| 9951393 | 1657857376354 | 8.000 | 298.0 | -0.162811 | 9.059936 | 3.332830 | -0.102606 | 0.065408 | -0.056617 | 801.0 | 20.808031 |
| 12007808 | 635655159910 | 3.000 | 6.0 | 0.428610 | 8.727334 | 4.320509 | 0.012741 | -0.018500 | 0.011083 | 656.0 | 0.593728 |
| 941316 | 163208757339 | 5.000 | 221.0 | -0.107739 | 9.607931 | 1.551443 | 0.023824 | 0.005498 | 0.001222 | 723.0 | 6.560000 |
# Re-seed so the label sample displayed below is reproducible across runs.
np.random.seed(555)
labels.sample(5)
| bookingID | label | |
|---|---|---|
| 11743 | 1022202216472 | 0 |
| 10401 | 1503238553774 | 1 |
| 3303 | 867583393862 | 0 |
| 13096 | 369367187649 | 0 |
| 6353 | 489626271866 | 1 |
print("Telemetics Data","\n",train.shape)
# BUG FIX: this line reports the *labels* frame, not the telematics frame —
# the original copy-pasted the "Telemetics Data" caption.
print("Labels Data","\n",labels.shape)
Telemetics Data (16135561, 11) Telemetics Data (20018, 2)
# check if there are any empty values
print(train.isnull().sum().sum())
print(labels.isnull().sum().sum())
# number of distinct trips present in the telematics data
len(train.groupby('bookingID'))
# seems like there are 18 extra booking IDs in the labels info set relative to the train data set
# Tabulate how many label rows each bookingID carries, then keep only the
# bookingIDs that occur more than once (i.e. conflicting / duplicate labels).
counts = pd.crosstab(index=labels['bookingID'], columns='count')
dup_table = counts[counts['count'] != 1]
print(dup_table)
print()
# Exclude every bookingID with multiple labels; the survivors form new_labels.
dup_ids = dup_table.index.values
new_labels = labels[~labels['bookingID'].isin(dup_ids)]
print()
print(new_labels.shape)
print()
print(new_labels.sample(5))
0
0
col_0 count
bookingID
13 2
154618822837 2
223338299461 2
395136991308 2
403726925929 2
455266533495 2
481036337234 2
515396075694 2
695784702084 2
919123001348 2
970662608932 2
1279900254294 2
1348619731077 2
1391569403991 2
1408749273124 2
1511828488211 2
1632087572573 2
1649267441751 2
(19982, 2)
bookingID label
6396 1460288880649 0
4118 987842478130 0
8240 292057776146 0
13496 1314259992615 0
13018 403726925954 1
# check number of trips labeled as dangerous, 4,983 labeled 1 as dangerous driving
print("No.of 0 and 1 in the label dataset","\n", new_labels.label.value_counts(),"\n")
# Attach the deduplicated labels to every telematics row; the inner join drops
# telematics rows whose bookingID has no (unique) label.
tele = train.merge(new_labels, on="bookingID")
print("Combined dataset", "\n", tele.shape)
# Order rows chronologically within each trip so one bookingID reads as a
# second-by-second trace; bookingID 10 is displayed below as an example.
tele = tele.sort_values(by=['bookingID', 'second'], ignore_index=True)
tele.loc[tele['bookingID'] == 10]
No.of 0 and 1 in the label dataset 0 14999 1 4983 Name: label, dtype: int64 Combined dataset (16116704, 12)
| bookingID | Accuracy | Bearing | acceleration_x | acceleration_y | acceleration_z | gyro_x | gyro_y | gyro_z | second | Speed | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5575 | 10 | 3.0 | 0.0 | 0.928955 | 9.296692 | -1.161194 | -0.013931 | -0.007431 | 0.006516 | 0.0 | 0.00 | 0 |
| 5576 | 10 | 3.0 | 0.0 | 0.794876 | 9.326614 | -1.337158 | 0.006302 | -0.002213 | 0.002243 | 1.0 | 0.00 | 0 |
| 5577 | 10 | 3.0 | 0.0 | 0.787689 | 9.301468 | -1.070206 | -0.016068 | -0.003250 | 0.006516 | 2.0 | 0.00 | 0 |
| 5578 | 10 | 3.0 | 0.0 | 0.746994 | 9.336197 | -1.231812 | 0.015884 | 0.000931 | -0.003082 | 3.0 | 0.00 | 0 |
| 5579 | 10 | 3.0 | 0.0 | 0.854736 | 9.543289 | -1.192307 | 0.032928 | 0.000931 | -0.009476 | 4.0 | 0.00 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5945 | 10 | 3.9 | 117.0 | 1.810013 | 9.018951 | -1.345535 | 0.015106 | 0.011658 | -0.026062 | 370.0 | 18.57 | 0 |
| 5946 | 10 | 3.9 | 117.0 | 0.377090 | 10.132263 | -1.854309 | -0.001724 | -0.000656 | 0.013351 | 371.0 | 18.50 | 0 |
| 5947 | 10 | 3.9 | 117.0 | 0.051468 | 9.044098 | -1.988388 | -0.007904 | 0.002670 | -0.002182 | 372.0 | 18.27 | 0 |
| 5948 | 10 | 3.9 | 117.0 | 2.486374 | 8.275558 | -1.874664 | -0.013657 | -0.002960 | -0.005173 | 373.0 | 18.11 | 0 |
| 5949 | 10 | 3.9 | 117.0 | -0.600952 | 10.137054 | -2.427719 | 0.000824 | 0.002258 | -0.002838 | 374.0 | 18.21 | 0 |
375 rows × 12 columns
# Sort the rows based on No.of counts
# Count the telemetry rows per trip and attach that count to every row.
# BUG FIX: the original passed as_index=False to .agg(); that is not an .agg()
# argument (it belongs on groupby) and was silently useless.  Merging on
# 'bookingID' still works because pandas can join on a named index level.
label_df = (
    tele.groupby('bookingID')
        .agg({'Accuracy': 'count'})
        .rename(columns={'Accuracy': 'count'})
        .merge(tele, on='bookingID')
        .sort_values(by='count', ascending=False)
)
# Select the booking ID with the most no.of lines in "1" group also known as dangerous driving
label_df[label_df['label'] == 1].head()
| bookingID | count | Accuracy | Bearing | acceleration_x | acceleration_y | acceleration_z | gyro_x | gyro_y | gyro_z | second | Speed | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4628645 | 438086664371 | 7561 | 3.000 | 0.0 | -0.289698 | 9.713276 | 1.426944 | 0.006188 | 0.004496 | -0.009505 | 1275.0 | 0.00 | 1 |
| 4629999 | 438086664371 | 7561 | 3.298 | 95.0 | 0.075417 | 9.879673 | 1.744176 | 0.011179 | -0.011484 | 0.012747 | 3149.0 | 1.74 | 1 |
| 4629987 | 438086664371 | 7561 | 3.685 | 93.0 | 0.538695 | 9.512163 | 1.307234 | 0.021564 | -0.042028 | 0.056729 | 3137.0 | 3.57 | 1 |
| 4629988 | 438086664371 | 7561 | 3.633 | 94.0 | -0.089783 | 9.518148 | 1.698686 | -0.001038 | -0.005986 | 0.018245 | 3138.0 | 3.19 | 1 |
| 4629989 | 438086664371 | 7561 | 3.539 | 93.0 | 0.231040 | 9.850943 | 2.323573 | -0.014478 | -0.004765 | -0.013520 | 3139.0 | 2.76 | 1 |
# Same view for the safe (label 0) trips, largest trip first
label_df.loc[label_df['label'] == 0].head()
| bookingID | count | Accuracy | Bearing | acceleration_x | acceleration_y | acceleration_z | gyro_x | gyro_y | gyro_z | second | Speed | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2440975 | 223338299473 | 2355 | 12.0 | 243.547516 | 3.143549 | -4.667172 | -7.326672 | 0.311425 | 0.083308 | 0.083045 | 0.0 | 0.063314 | 0 |
| 2442259 | 223338299473 | 2355 | 48.0 | 350.140594 | 0.377429 | -8.490961 | 1.040472 | -0.285346 | -0.662202 | -0.151095 | 1309.0 | -1.000000 | 0 |
| 2442249 | 223338299473 | 2355 | 24.0 | 281.207031 | -0.323447 | -8.048035 | -3.359778 | 0.029960 | -0.053705 | -0.081434 | 1299.0 | 2.818063 | 0 |
| 2442250 | 223338299473 | 2355 | 32.0 | 283.398041 | 1.511063 | -15.655835 | -3.981699 | -0.113846 | -0.108543 | -0.011182 | 1300.0 | -1.000000 | 0 |
| 2442251 | 223338299473 | 2355 | 32.0 | 283.398041 | -0.158508 | -8.588907 | -2.026663 | 0.007388 | -0.006578 | -0.003180 | 1301.0 | -1.000000 | 0 |
# bookingID 223338299473 (safe) and 438086664371 (dangerous) are used for EDA
temp_safe_df = tele.loc[tele['bookingID'] == 223338299473].sort_values('second')
temp_unsafe_df = tele.loc[tele['bookingID'] == 438086664371].sort_values('second')
# Create function for EDA plotting
def plot(vals, title="", xlabel="", ylabel=''):
    """Draw one line per entry of `vals` on a single wide axis.

    Each entry of `vals` is a dict with keys:
      'y'     : required sequence of y-values
      'x'     : optional sequence of x-values (defaults to 0..len(y)-1)
      'label' : optional legend label (defaults to '')
      'c'     : optional matplotlib colour (defaults to 'b')
    """
    fig = plt.figure(figsize=(14, 3))
    ax = plt.subplot(111)
    for val in vals:
        label = val.get('label', '')
        c = val.get('c', 'b')
        # BUG FIX: the computed x values were never passed to ax.plot(), so
        # every series was drawn against its row index instead of the caller's
        # 'second' values.  Plot x explicitly.
        x = val.get('x', range(len(val['y'])))
        ax.plot(x, val['y'], label=label, c=c)
    # Shrink current axis by 20% so the legend fits outside the plot area
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
    # Put a legend to the right of the current axis
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.show()
# Plot for Safe/Unsafe Acceleration, Safe/Unsafe Gyro, Safe/Unsafe Bearing
def _xyz_vals(df, prefix, axis_colours):
    # Build one plot-spec dict per axis column (e.g. acceleration_x / gyro_x).
    return [{'x': df['second'].values,
             'y': df[prefix + axis].values,
             'label': axis, 'c': colour}
            for axis, colour in axis_colours]

_acc_colours = [('x', 'r'), ('y', 'g'), ('z', 'b')]
_gyro_colours = [('x', 'r'), ('y', 'y'), ('z', 'b')]

# Acceleration
plot(_xyz_vals(temp_safe_df, 'acceleration_', _acc_colours),
     title='Safe Acceleration', xlabel='second', ylabel='m/s2')
plot(_xyz_vals(temp_unsafe_df, 'acceleration_', _acc_colours),
     title='un-Safe Acceleration', xlabel='second', ylabel='m/s2')
# Gyro
plot(_xyz_vals(temp_safe_df, 'gyro_', _gyro_colours),
     title='Safe Gyro', xlabel='second', ylabel='rad/s')
plot(_xyz_vals(temp_unsafe_df, 'gyro_', _gyro_colours),
     title='unSafe Gyro', xlabel='second', ylabel='rad/s')
# Bearing — reported in degrees (0-360 per tele.describe()).  BUG FIX: the
# y-axis was mislabelled 'm/s2' and the unsafe trace's legend said
# 'unSafe Speed' on a Bearing chart.
xlabel = 'second'
ylabel = 'degrees'
plot([{'x': temp_safe_df['second'].values,
       'y': temp_safe_df['Bearing'].values, 'label': 'Safe Bearing', 'c': 'g'}],
     'Safe Bearing', xlabel, ylabel)
plot([{'x': temp_unsafe_df['second'].values,
       'y': temp_unsafe_df['Bearing'].values, 'label': 'unSafe Bearing', 'c': 'r'}],
     'un-Safe Bearing', xlabel, ylabel)
# Speed (m/s)
xlabel = 'second'
ylabel = 'm/s'
plot([{'x': temp_safe_df['second'].values,
       'y': temp_safe_df['Speed'].values, 'label': 'Safe Speed', 'c': 'g'}],
     'Safe Speed', xlabel, ylabel)
plot([{'x': temp_unsafe_df['second'].values,
       'y': temp_unsafe_df['Speed'].values, 'label': 'unSafe Speed', 'c': 'r'}],
     'un-Safe Speed', xlabel, ylabel)
# Derive acceleration as the discrete d(Speed)/d(second) within each sorted
# trace.  (The column name 'acceration' is a typo, but it is kept unchanged so
# any later cell that references it still works.)
temp_safe_df['acceration'] = temp_safe_df['Speed'].diff() / temp_safe_df['second'].diff()
temp_unsafe_df['acceration'] = temp_unsafe_df['Speed'].diff() / temp_unsafe_df['second'].diff()
xlabel = 'second'
ylabel = 'm/s2'
# BUG FIX: the legends previously said 'Safe Speed' / 'unSafe Speed' even
# though these charts plot the derived acceleration.
plot([{'x': temp_safe_df['second'].values,
       'y': temp_safe_df['acceration'].values, 'label': 'Safe acceleration', 'c': 'g'}],
     'Safe acceleration', xlabel, ylabel)
plot([{'x': temp_unsafe_df['second'].values,
       'y': temp_unsafe_df['acceration'].values, 'label': 'unSafe acceleration', 'c': 'r'}],
     'un-Safe acceleration', xlabel, ylabel)
tele.describe()
| bookingID | Accuracy | Bearing | acceleration_x | acceleration_y | acceleration_z | gyro_x | gyro_y | gyro_z | second | Speed | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.611670e+07 | 1.611670e+07 | 1.611670e+07 | 1.611670e+07 | 1.611670e+07 | 1.611670e+07 | 1.611670e+07 | 1.611670e+07 | 1.611670e+07 | 1.611670e+07 | 1.611670e+07 | 1.611670e+07 |
| mean | 8.184342e+11 | 1.161327e+01 | 1.689596e+02 | 6.897958e-02 | 4.473604e+00 | 8.958685e-01 | -1.708257e-03 | 2.938396e-04 | -2.601239e-04 | 3.807072e+03 | 9.006461e+00 | 3.052742e-01 |
| std | 4.951572e+11 | 8.696429e+01 | 1.072970e+02 | 1.424623e+00 | 8.128499e+00 | 3.250204e+00 | 1.446192e-01 | 3.402431e-01 | 1.481542e-01 | 1.437527e+06 | 8.106538e+00 | 4.605235e-01 |
| min | 0.000000e+00 | 7.500000e-01 | 0.000000e+00 | -7.841969e+01 | -7.299412e+01 | -7.844842e+01 | -4.845575e+01 | -7.488861e+01 | -5.355445e+01 | 0.000000e+00 | -2.000000e+00 | 0.000000e+00 |
| 25% | 3.779571e+11 | 3.900000e+00 | 7.800000e+01 | -5.084229e-01 | -1.932755e+00 | -9.298957e-01 | -2.679162e-02 | -2.993423e-02 | -1.876969e-02 | 2.410000e+02 | 1.020000e+00 | 0.000000e+00 |
| 50% | 8.074539e+11 | 4.244000e+00 | 1.687888e+02 | 6.120300e-02 | 9.082404e+00 | 7.769897e-01 | -6.420414e-04 | 2.663161e-04 | -3.726278e-05 | 5.200000e+02 | 7.530000e+00 | 0.000000e+00 |
| 75% | 1.254130e+12 | 8.000000e+00 | 2.629526e+02 | 6.347809e-01 | 9.709925e+00 | 2.751175e+00 | 2.331571e-02 | 3.144551e-02 | 1.823425e-02 | 8.620000e+02 | 1.548000e+01 | 1.000000e+00 |
| max | 1.709397e+12 | 6.070101e+03 | 3.599995e+02 | 6.687346e+01 | 7.505589e+01 | 7.805576e+01 | 3.983975e+01 | 8.031496e+01 | 6.630078e+01 | 1.495797e+09 | 1.480186e+02 | 1.000000e+00 |
from sklearn.preprocessing import StandardScaler  # NOTE(review): duplicate of top-of-file imports; kept for cell independence
from sklearn.preprocessing import MinMaxScaler
# Filter out telematics data with negative speeds
temp3 = tele[tele['Speed'] > 0]
print(tele.shape)
print(temp3.shape)
# Filter out telematics rows with extremely large Accuracy (> 95th percentile).
# Accuracy is a misnomer: the larger the value, the less accurate the GPS fix.
# NOTE(review): the percentile is computed on the raw `train` frame (before
# label merging/filtering) — confirm this is intended rather than temp3.Accuracy.
temp3 = temp3.loc[temp3.Accuracy <= np.percentile(train.Accuracy, 95)]
print(temp3.shape)
df = temp3.drop(['label'], axis=1)
# Derived per-row features: resultant magnitudes and speed-weighted rotation.
df['Resultant Acceleration'] = np.sqrt(df['acceleration_x']**2 + df['acceleration_y']**2 + df['acceleration_z']**2)
df['Resultant gyro'] = np.sqrt(df['gyro_x']**2 + df['gyro_y']**2 + df['gyro_z']**2)
df['Resultant Acc & Gyro'] = df['Resultant Acceleration'] * df['Resultant gyro']
df['harsh_yaw'] = df['gyro_x'] * df['Speed']
df['harsh_pitch'] = df['gyro_z'] * df['Speed']
# BUG FIX: compute the speed delta *within* each trip.  The previous global
# shift(1) combined the last row of one bookingID with the first row of the
# next, producing a garbage value at every trip boundary.  (The column really
# holds an acceleration; the name 'velocity' is kept for downstream
# compatibility with the mean_velocity/std_velocity/... feature names.)
df['velocity'] = df.groupby('bookingID')['Speed'].diff() / df.groupby('bookingID')['second'].diff()
df.replace([np.inf, -np.inf], np.nan, inplace=True)  # division by zero-second gaps -> inf
df.dropna(inplace=True)
df.head()
#Normalize data before further processing
#columns_to_norm = df.drop(['bookingID'], axis=1).columns
#df[columns_to_norm] = MinMaxScaler().fit_transform(df[columns_to_norm])
#df.head()
# Collapse the per-second rows into one feature row per trip (bookingID),
# reusing a single GroupBy object instead of regrouping six times.
grouped = df.groupby('bookingID')
df_mean = grouped.mean().add_prefix('mean_')
df_std = grouped.std().add_prefix('std_')
df_min = grouped.min().add_prefix('min_')
df_max = grouped.max().add_prefix('max_')
df_sum = grouped.sum().add_prefix('sum_')
df_var = grouped.var().add_prefix('var_')
df_1 = pd.merge(df_mean, df_std, on='bookingID', how='inner')
df_2 = pd.merge(df_min, df_max, on='bookingID', how='inner')
df_3 = pd.merge(df_sum, df_var, on='bookingID', how='inner')
df_4 = pd.merge(df_1, df_2, on='bookingID', how='inner')
temp4 = pd.merge(df_4, df_3, on='bookingID', how='inner')
# Drop any rows containing inf/nan produced by the aggregations
temp4.replace([np.inf, -np.inf], np.nan, inplace=True)
temp4.dropna(inplace=True)
print(temp4.shape)
# Merge the trip features with labels, dropping trips without a label.
# BUG FIX: merge with new_labels (duplicates removed earlier); merging with
# the raw `labels` frame re-introduced the 18 bookingIDs that carry two
# conflicting labels, duplicating those feature rows.
train = pd.merge(temp4, new_labels, on='bookingID', how='inner')
print(train.shape)
(16116704, 12) (13109640, 12) (12690147, 12) (19928, 96) (19928, 98)
#final check before regression
# Persist the per-trip feature table to disk; the H2O section below re-imports
# this exact CSV via h2o.import_file('train.csv').
train.info()
train.to_csv('train.csv',index=False)
<class 'pandas.core.frame.DataFrame'> Int64Index: 19928 entries, 0 to 19927 Data columns (total 98 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 bookingID 19928 non-null int64 1 mean_Accuracy 19928 non-null float64 2 mean_Bearing 19928 non-null float64 3 mean_acceleration_x 19928 non-null float64 4 mean_acceleration_y 19928 non-null float64 5 mean_acceleration_z 19928 non-null float64 6 mean_gyro_x 19928 non-null float64 7 mean_gyro_y 19928 non-null float64 8 mean_gyro_z 19928 non-null float64 9 mean_second 19928 non-null float64 10 mean_Speed 19928 non-null float64 11 mean_Resultant Acceleration 19928 non-null float64 12 mean_Resultant gyro 19928 non-null float64 13 mean_Resultant Acc & Gyro 19928 non-null float64 14 mean_harsh_yaw 19928 non-null float64 15 mean_harsh_pitch 19928 non-null float64 16 mean_velocity 19928 non-null float64 17 std_Accuracy 19928 non-null float64 18 std_Bearing 19928 non-null float64 19 std_acceleration_x 19928 non-null float64 20 std_acceleration_y 19928 non-null float64 21 std_acceleration_z 19928 non-null float64 22 std_gyro_x 19928 non-null float64 23 std_gyro_y 19928 non-null float64 24 std_gyro_z 19928 non-null float64 25 std_second 19928 non-null float64 26 std_Speed 19928 non-null float64 27 std_Resultant Acceleration 19928 non-null float64 28 std_Resultant gyro 19928 non-null float64 29 std_Resultant Acc & Gyro 19928 non-null float64 30 std_harsh_yaw 19928 non-null float64 31 std_harsh_pitch 19928 non-null float64 32 std_velocity 19928 non-null float64 33 min_Accuracy 19928 non-null float64 34 min_Bearing 19928 non-null float64 35 min_acceleration_x 19928 non-null float64 36 min_acceleration_y 19928 non-null float64 37 min_acceleration_z 19928 non-null float64 38 min_gyro_x 19928 non-null float64 39 min_gyro_y 19928 non-null float64 40 min_gyro_z 19928 non-null float64 41 min_second 19928 non-null float64 42 min_Speed 19928 non-null float64 43 min_Resultant Acceleration 19928 non-null float64 44 
min_Resultant gyro 19928 non-null float64 45 min_Resultant Acc & Gyro 19928 non-null float64 46 min_harsh_yaw 19928 non-null float64 47 min_harsh_pitch 19928 non-null float64 48 min_velocity 19928 non-null float64 49 max_Accuracy 19928 non-null float64 50 max_Bearing 19928 non-null float64 51 max_acceleration_x 19928 non-null float64 52 max_acceleration_y 19928 non-null float64 53 max_acceleration_z 19928 non-null float64 54 max_gyro_x 19928 non-null float64 55 max_gyro_y 19928 non-null float64 56 max_gyro_z 19928 non-null float64 57 max_second 19928 non-null float64 58 max_Speed 19928 non-null float64 59 max_Resultant Acceleration 19928 non-null float64 60 max_Resultant gyro 19928 non-null float64 61 max_Resultant Acc & Gyro 19928 non-null float64 62 max_harsh_yaw 19928 non-null float64 63 max_harsh_pitch 19928 non-null float64 64 max_velocity 19928 non-null float64 65 sum_Accuracy 19928 non-null float64 66 sum_Bearing 19928 non-null float64 67 sum_acceleration_x 19928 non-null float64 68 sum_acceleration_y 19928 non-null float64 69 sum_acceleration_z 19928 non-null float64 70 sum_gyro_x 19928 non-null float64 71 sum_gyro_y 19928 non-null float64 72 sum_gyro_z 19928 non-null float64 73 sum_second 19928 non-null float64 74 sum_Speed 19928 non-null float64 75 sum_Resultant Acceleration 19928 non-null float64 76 sum_Resultant gyro 19928 non-null float64 77 sum_Resultant Acc & Gyro 19928 non-null float64 78 sum_harsh_yaw 19928 non-null float64 79 sum_harsh_pitch 19928 non-null float64 80 sum_velocity 19928 non-null float64 81 var_Accuracy 19928 non-null float64 82 var_Bearing 19928 non-null float64 83 var_acceleration_x 19928 non-null float64 84 var_acceleration_y 19928 non-null float64 85 var_acceleration_z 19928 non-null float64 86 var_gyro_x 19928 non-null float64 87 var_gyro_y 19928 non-null float64 88 var_gyro_z 19928 non-null float64 89 var_second 19928 non-null float64 90 var_Speed 19928 non-null float64 91 var_Resultant Acceleration 19928 non-null float64 92 
var_Resultant gyro 19928 non-null float64 93 var_Resultant Acc & Gyro 19928 non-null float64 94 var_harsh_yaw 19928 non-null float64 95 var_harsh_pitch 19928 non-null float64 96 var_velocity 19928 non-null float64 97 label 19928 non-null int64 dtypes: float64(96), int64(2) memory usage: 15.1 MB
# Separate features from the target; bookingID is an identifier, not a feature.
X = train.drop(columns=['bookingID', 'label'])
y = train['label']
# Hold out 30% for testing; fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8)
print(X_train.shape, X_test.shape)
(13949, 96) (5979, 96)
# Base classifiers; a fixed random_state keeps results reproducible.
rf = RandomForestClassifier(random_state=53, n_jobs=-1, n_estimators=100, max_depth=3)
log = LogisticRegression(random_state=53, n_jobs=-1, solver='lbfgs')
gnb = GaussianNB()
gbc = GradientBoostingClassifier(random_state=53, n_estimators=100, learning_rate=0.05)
# BUG FIX: the list previously contained the xgboost *module* (`xgb`), which
# has no fit/predict; store an estimator instance so every element of `models`
# exposes the sklearn interface.
models = [rf, log, gnb, gbc, xgb.XGBClassifier()]
plt.figure(figsize=(10, 8))

def _fit_and_plot_roc(model, name, color):
    """Fit `model` on the train split, draw its ROC curve on the current
    figure, and return (accuracy, roc_auc) computed on the test split."""
    model.fit(X_train, y_train)
    proba = model.predict_proba(X_test)[:, 1]
    acc = metrics.accuracy_score(y_test, model.predict(X_test))
    auc = metrics.roc_auc_score(y_test, proba)
    fpr, tpr, _ = metrics.roc_curve(y_test, proba)
    plt.plot(fpr, tpr, color=color, label='%s ROC curve (area = %0.2f)' % (name, auc))
    return acc, auc

# One curve per base model (replaces five copy-pasted fit/score/plot blocks).
rf_acc, rf_auc = _fit_and_plot_roc(models[0], 'Random Forest', 'y')
log_acc, log_auc = _fit_and_plot_roc(models[1], 'LogisticRegression', 'black')
gnb_acc, gnb_auc = _fit_and_plot_roc(models[2], 'GaussianNB', 'orange')
gbc_acc, gbc_auc = _fit_and_plot_roc(models[3], 'GradientBoostingClassifier', 'purple')
#XGBOOST
classifier = xgb.XGBClassifier()
xgb_acc, xgb_auc = _fit_and_plot_roc(classifier, 'XGBOOST', 'Blue')
# Preserve the globals later cells depend on: after this cell, y_pred/pred
# refer to the last-fitted (XGBoost) model, exactly as in the original order.
y_pred = classifier.predict(X_test)
pred = classifier.predict_proba(X_test)[:, 1]
# Chance diagonal and axis dressing
plt.plot([0, 1], [0, 1], color='g', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc=0)
[22:59:27] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
<matplotlib.legend.Legend at 0x2990c92c8e0>
# rf,log,gnb,gbc
# Report every model's test-set AUC in one loop (print's default separator
# reproduces the original spacing exactly).
for model_name, model_auc in [('rf', rf_auc), ('log', log_auc),
                              ('gnb', gnb_auc), ('gbc', gbc_auc),
                              ('xgb', xgb_auc)]:
    print(model_name, 'Model AUC \t\t\t', model_auc)
rf Model AUC 0.6993460734108748 log Model AUC 0.6614108772493958 gnb Model AUC 0.6737852542336276 gbc Model AUC 0.7393912593017424 xgb Model AUC 0.7066053169611468
# Per-class precision/recall/F1 for the most recently assigned y_pred
# (the XGBoost predictions from the ROC cell above).
report = classification_report(y_test, y_pred)
print(report)
precision recall f1-score support
0 0.80 0.93 0.86 4496
1 0.59 0.30 0.40 1483
accuracy 0.78 5979
macro avg 0.70 0.62 0.63 5979
weighted avg 0.75 0.78 0.75 5979
# Precision is the fraction of predicted positives events that are actually positive
# Recall (also known as sensitivity) is the fraction of positives events that you predicted correctly
# The f1 score is the harmonic mean of recall and precision, with a higher score as a better model
#H2O pip install h2o
# Start (or attach to) a local JVM-backed H2O server.
# nthreads=-1 -> use all available cores; max_mem_size=8 -> 8 GB JVM heap.
import h2o
h2o.init(nthreads = -1, max_mem_size = 8)
Checking whether there is an H2O instance running at http://localhost:54321 ..... not found. Attempting to start a local H2O server... ; Java HotSpot(TM) 64-Bit Server VM (build 25.281-b09, mixed mode) Starting server from C:\Users\Gavin Ng\anaconda3\Lib\site-packages\h2o\backend\bin\h2o.jar Ice root: C:\Users\GAVINN~1\AppData\Local\Temp\tmphwra_yj6 JVM stdout: C:\Users\GAVINN~1\AppData\Local\Temp\tmphwra_yj6\h2o_gavin_ng_started_from_python.out JVM stderr: C:\Users\GAVINN~1\AppData\Local\Temp\tmphwra_yj6\h2o_gavin_ng_started_from_python.err Server is running at http://127.0.0.1:54321 Connecting to H2O server at http://127.0.0.1:54321 ... successful.
| H2O_cluster_uptime: | 10 secs |
| H2O_cluster_timezone: | Asia/Singapore |
| H2O_data_parsing_timezone: | UTC |
| H2O_cluster_version: | 3.34.0.1 |
| H2O_cluster_version_age: | 1 month and 1 day |
| H2O_cluster_name: | H2O_from_python_gavin_ng_zff80x |
| H2O_cluster_total_nodes: | 1 |
| H2O_cluster_free_memory: | 7.111 Gb |
| H2O_cluster_total_cores: | 0 |
| H2O_cluster_allowed_cores: | 0 |
| H2O_cluster_status: | locked, healthy |
| H2O_connection_url: | http://127.0.0.1:54321 |
| H2O_connection_proxy: | {"http": null, "https": null} |
| H2O_internal_security: | False |
| H2O_API_Extensions: | Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4 |
| Python_version: | 3.8.5 final |
#Import data
# Re-load the per-trip feature CSV (written earlier via train.to_csv) as an H2OFrame.
data= h2o.import_file('train.csv')
data['label'] = data['label'].asfactor() #encode the binary response as a factor so H2O treats this as classification
data['label'].levels() #optional: after encoding, this shows the two factor levels, '0' and '1'
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
[['0', '1']]
# Partition data into 70%, 15%, 15% chunks
# Setting a seed will guarantee reproducibility
# NOTE(review): `train`, `y` and `x` reassigned here shadow the earlier pandas
# frame / sklearn target — intentional hand-off to the H2O section, but verify
# no later cell expects the pandas versions.
splits = data.split_frame(ratios=[0.7, 0.15], seed=1)
train, valid, test = splits
y = 'label'
x = [col for col in data.columns if col != y]  # all columns except the response
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
# Initialize and train the DL estimator:
# Baseline model with H2O defaults (the run log below reports two 200-unit
# Rectifier hidden layers and 10 epochs); seed=1 for reproducible sampling.
dl_fit1 = H2ODeepLearningEstimator(model_id='dl_fit1', seed=1)
dl_fit1.train(x=x, y=y, training_frame=train)
deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100% Model Details ============= H2ODeepLearningEstimator : Deep Learning Model Key: dl_fit1 Status of Neuron Layers: predicting label, 2-class classification, bernoulli distribution, CrossEntropy loss, 60,202 weights/biases, 746.0 KB, 140,100 training samples, mini-batch size 1
| layer | units | type | dropout | l1 | l2 | mean_rate | rate_rms | momentum | mean_weight | weight_rms | mean_bias | bias_rms | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 97 | Input | 0 | ||||||||||
| 1 | 2 | 200 | Rectifier | 0 | 0 | 0 | 0.0123603 | 0.0116127 | 0 | 0.00417988 | 0.103188 | 0.238184 | 0.0857536 | |
| 2 | 3 | 200 | Rectifier | 0 | 0 | 0 | 0.0828055 | 0.116261 | 0 | -0.0265887 | 0.0779646 | 0.912515 | 0.0498806 | |
| 3 | 4 | 2 | Softmax | 0 | 0 | 0.00319736 | 0.00143035 | 0 | 0.010144 | 0.369074 | 0.000298099 | 0.0252621 |
ModelMetricsBinomial: deeplearning ** Reported on train data. ** MSE: 0.12765627480055952 RMSE: 0.3572901829053795 LogLoss: 0.40224836478735354 Mean Per-Class Error: 0.2412267796611065 AUC: 0.8429021139153298 AUCPR: 0.7068695494114038 Gini: 0.6858042278306595 Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.26412954647725784:
| 0 | 1 | Error | Rate | ||
|---|---|---|---|---|---|
| 0 | 0 | 6406.0 | 1099.0 | 0.1464 | (1099.0/7505.0) |
| 1 | 1 | 864.0 | 1657.0 | 0.3427 | (864.0/2521.0) |
| 2 | Total | 7270.0 | 2756.0 | 0.1958 | (1963.0/10026.0) |
Maximum Metrics: Maximum metrics at their respective thresholds
| metric | threshold | value | idx | |
|---|---|---|---|---|
| 0 | max f1 | 0.264130 | 0.628008 | 218.0 |
| 1 | max f2 | 0.118903 | 0.711924 | 306.0 |
| 2 | max f0point5 | 0.413591 | 0.677956 | 152.0 |
| 3 | max accuracy | 0.413591 | 0.829443 | 152.0 |
| 4 | max precision | 0.999705 | 1.000000 | 0.0 |
| 5 | max recall | 0.010901 | 1.000000 | 393.0 |
| 6 | max specificity | 0.999705 | 1.000000 | 0.0 |
| 7 | max absolute_mcc | 0.311634 | 0.505997 | 195.0 |
| 8 | max min_per_class_accuracy | 0.196474 | 0.755256 | 255.0 |
| 9 | max mean_per_class_accuracy | 0.230392 | 0.758773 | 236.0 |
| 10 | max tns | 0.999705 | 7505.000000 | 0.0 |
| 11 | max fns | 0.999705 | 2471.000000 | 0.0 |
| 12 | max fps | 0.001212 | 7505.000000 | 399.0 |
| 13 | max tps | 0.010901 | 2521.000000 | 393.0 |
| 14 | max tnr | 0.999705 | 1.000000 | 0.0 |
| 15 | max fnr | 0.999705 | 0.980167 | 0.0 |
| 16 | max fpr | 0.001212 | 1.000000 | 399.0 |
| 17 | max tpr | 0.010901 | 1.000000 | 393.0 |
Gains/Lift Table: Avg response rate: 25.14 %, avg score: 21.55 %
| group | cumulative_data_fraction | lower_threshold | lift | cumulative_lift | response_rate | score | cumulative_response_rate | cumulative_score | capture_rate | cumulative_capture_rate | gain | cumulative_gain | kolmogorov_smirnov | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0.010074 | 0.991175 | 3.937617 | 3.937617 | 0.990099 | 0.997745 | 0.990099 | 0.997745 | 0.039667 | 0.039667 | 293.761709 | 293.761709 | 0.039534 |
| 1 | 2 | 0.020048 | 0.917458 | 3.857683 | 3.897849 | 0.970000 | 0.959962 | 0.980100 | 0.978948 | 0.038477 | 0.078144 | 285.768346 | 289.784911 | 0.077611 |
| 2 | 3 | 0.030022 | 0.842091 | 3.857683 | 3.884505 | 0.970000 | 0.883041 | 0.976744 | 0.947085 | 0.038477 | 0.116620 | 285.768346 | 288.450504 | 0.115688 |
| 3 | 4 | 0.040096 | 0.775987 | 3.701360 | 3.838491 | 0.930693 | 0.808670 | 0.965174 | 0.912309 | 0.037287 | 0.153907 | 270.136006 | 283.849100 | 0.152042 |
| 4 | 5 | 0.050070 | 0.718439 | 3.539524 | 3.778936 | 0.890000 | 0.746736 | 0.950199 | 0.879327 | 0.035303 | 0.189211 | 253.952400 | 277.893582 | 0.185880 |
| 5 | 6 | 0.100040 | 0.518851 | 3.087925 | 3.433775 | 0.776447 | 0.607847 | 0.863410 | 0.743722 | 0.154304 | 0.343514 | 208.792490 | 243.377484 | 0.325260 |
| 6 | 7 | 0.150010 | 0.410780 | 2.317928 | 3.062073 | 0.582834 | 0.460563 | 0.769947 | 0.649398 | 0.115827 | 0.459342 | 131.792821 | 206.207327 | 0.413239 |
| 7 | 8 | 0.200080 | 0.337005 | 1.742905 | 2.731952 | 0.438247 | 0.369593 | 0.686939 | 0.579377 | 0.087267 | 0.546608 | 74.290541 | 173.195250 | 0.462931 |
| 8 | 9 | 0.300020 | 0.241147 | 1.385200 | 2.283334 | 0.348303 | 0.286401 | 0.574136 | 0.481783 | 0.138437 | 0.685046 | 38.520025 | 128.333356 | 0.514359 |
| 9 | 10 | 0.400060 | 0.181834 | 0.908007 | 1.939416 | 0.228315 | 0.208980 | 0.487659 | 0.413566 | 0.090837 | 0.775883 | -9.199257 | 93.941631 | 0.502065 |
| 10 | 11 | 0.500000 | 0.139224 | 0.710461 | 1.693772 | 0.178643 | 0.159766 | 0.425893 | 0.362836 | 0.071004 | 0.846886 | -28.953913 | 69.377231 | 0.463408 |
| 11 | 12 | 0.600040 | 0.107285 | 0.602695 | 1.511866 | 0.151545 | 0.122526 | 0.380153 | 0.322771 | 0.060294 | 0.907180 | -39.730511 | 51.186562 | 0.410311 |
| 12 | 13 | 0.699980 | 0.079218 | 0.448503 | 1.360043 | 0.112774 | 0.093016 | 0.341978 | 0.289968 | 0.044823 | 0.952003 | -55.149677 | 36.004329 | 0.336680 |
| 13 | 14 | 0.800020 | 0.053708 | 0.313243 | 1.229144 | 0.078764 | 0.066622 | 0.309064 | 0.262039 | 0.031337 | 0.983340 | -68.675726 | 22.914428 | 0.244899 |
| 14 | 15 | 0.899960 | 0.029426 | 0.119072 | 1.105871 | 0.029940 | 0.041709 | 0.278067 | 0.237571 | 0.011900 | 0.995240 | -88.092835 | 10.587123 | 0.127285 |
| 15 | 16 | 1.000000 | 0.000007 | 0.047581 | 1.000000 | 0.011964 | 0.016491 | 0.251446 | 0.215455 | 0.004760 | 1.000000 | -95.241882 | 0.000000 | 0.000000 |
Scoring History:
| timestamp | duration | training_speed | epochs | iterations | samples | training_rmse | training_logloss | training_r2 | training_auc | training_pr_auc | training_lift | training_classification_error | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2021-10-15 23:00:38 | 0.000 sec | None | 0.0 | 0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | |
| 1 | 2021-10-15 23:00:45 | 9.982 sec | 2703 obs/sec | 1.0 | 1 | 14010.0 | 0.457790 | 0.876820 | -0.113434 | 0.695196 | 0.484662 | 3.819489 | 0.341213 | |
| 2 | 2021-10-15 23:01:11 | 34.031 sec | 3918 obs/sec | 8.0 | 8 | 112080.0 | 0.390472 | 0.471703 | 0.189952 | 0.813720 | 0.654867 | 3.898241 | 0.230999 | |
| 3 | 2021-10-15 23:01:17 | 41.014 sec | 4108 obs/sec | 10.0 | 10 | 140100.0 | 0.357290 | 0.402248 | 0.321775 | 0.842902 | 0.706870 | 3.937617 | 0.195791 |
Variable Importances:
| variable | relative_importance | scaled_importance | percentage | |
|---|---|---|---|---|
| 0 | min_Speed | 1.000000 | 1.000000 | 0.014049 |
| 1 | mean_second | 0.990630 | 0.990630 | 0.013917 |
| 2 | var_Resultant Acceleration | 0.965330 | 0.965330 | 0.013562 |
| 3 | max_second | 0.951215 | 0.951215 | 0.013363 |
| 4 | bookingID | 0.936759 | 0.936759 | 0.013160 |
| 5 | max_Bearing | 0.931562 | 0.931562 | 0.013087 |
| 6 | min_harsh_yaw | 0.918171 | 0.918171 | 0.012899 |
| 7 | std_Bearing | 0.865941 | 0.865941 | 0.012165 |
| 8 | mean_Bearing | 0.861074 | 0.861074 | 0.012097 |
| 9 | mean_Speed | 0.851646 | 0.851646 | 0.011964 |
| 10 | mean_gyro_y | 0.835922 | 0.835922 | 0.011744 |
| 11 | min_Resultant Acceleration | 0.835084 | 0.835084 | 0.011732 |
| 12 | var_Bearing | 0.825196 | 0.825196 | 0.011593 |
| 13 | min_Resultant gyro | 0.822430 | 0.822430 | 0.011554 |
| 14 | std_Resultant Acceleration | 0.816845 | 0.816845 | 0.011476 |
| 15 | max_Accuracy | 0.812802 | 0.812802 | 0.011419 |
| 16 | min_Bearing | 0.811187 | 0.811187 | 0.011396 |
| 17 | max_acceleration_x | 0.809063 | 0.809063 | 0.011366 |
| 18 | max_Resultant Acceleration | 0.807742 | 0.807742 | 0.011348 |
| 19 | mean_Resultant Acceleration | 0.805028 | 0.805028 | 0.011309 |
See the whole table with table.as_data_frame()
# Second deep-learning model: a new [10, 10] architecture trained for a
# larger epoch budget, with early stopping switched off so all 20 epochs run.
dl_fit2 = H2ODeepLearningEstimator(
    model_id='dl_fit2',
    hidden=[10, 10],    # two hidden layers of 10 units each
    epochs=20,
    stopping_rounds=0,  # disable early stopping
    seed=1,             # reproducible initialisation
)
dl_fit2.train(x=x, y=y, training_frame=train)
deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100% Model Details ============= H2ODeepLearningEstimator : Deep Learning Model Key: dl_fit2 Status of Neuron Layers: predicting label, 2-class classification, bernoulli distribution, CrossEntropy loss, 1,112 weights/biases, 49.0 KB, 280,200 training samples, mini-batch size 1
| layer | units | type | dropout | l1 | l2 | mean_rate | rate_rms | momentum | mean_weight | weight_rms | mean_bias | bias_rms | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 97 | Input | 0 | ||||||||||
| 1 | 2 | 10 | Rectifier | 0 | 0 | 0 | 0.00247723 | 0.00296249 | 0 | 0.0030383 | 0.164107 | 0.26273 | 0.159559 | |
| 2 | 3 | 10 | Rectifier | 0 | 0 | 0 | 0.000684916 | 0.000467619 | 0 | 0.00686666 | 0.273461 | 1.00521 | 0.172292 | |
| 3 | 4 | 2 | Softmax | 0 | 0 | 0.000932928 | 0.000365406 | 0 | 0.590856 | 1.5725 | 0.000917013 | 0.0376395 |
ModelMetricsBinomial: deeplearning ** Reported on train data. ** MSE: 0.15141419650666418 RMSE: 0.389119771415774 LogLoss: 0.4682834081823579 Mean Per-Class Error: 0.3132543133349419 AUC: 0.7627901377925758 AUCPR: 0.5738845827727845 Gini: 0.5255802755851515 Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.2585481864837233:
| 0 | 1 | Error | Rate | ||
|---|---|---|---|---|---|
| 0 | 0 | 5786.0 | 1719.0 | 0.229 | (1719.0/7505.0) |
| 1 | 1 | 1002.0 | 1519.0 | 0.3975 | (1002.0/2521.0) |
| 2 | Total | 6788.0 | 3238.0 | 0.2714 | (2721.0/10026.0) |
Maximum Metrics: Maximum metrics at their respective thresholds
| metric | threshold | value | idx | |
|---|---|---|---|---|
| 0 | max f1 | 0.258548 | 0.527522 | 227.0 |
| 1 | max f2 | 0.114582 | 0.668014 | 330.0 |
| 2 | max f0point5 | 0.421891 | 0.549677 | 142.0 |
| 3 | max accuracy | 0.507575 | 0.789747 | 110.0 |
| 4 | max precision | 0.989162 | 0.979381 | 4.0 |
| 5 | max recall | 0.000528 | 1.000000 | 399.0 |
| 6 | max specificity | 0.999588 | 0.999734 | 0.0 |
| 7 | max absolute_mcc | 0.309995 | 0.360628 | 196.0 |
| 8 | max min_per_class_accuracy | 0.222656 | 0.682269 | 252.0 |
| 9 | max mean_per_class_accuracy | 0.258548 | 0.686746 | 227.0 |
| 10 | max tns | 0.999588 | 7503.000000 | 0.0 |
| 11 | max fns | 0.999588 | 2477.000000 | 0.0 |
| 12 | max fps | 0.000528 | 7505.000000 | 399.0 |
| 13 | max tps | 0.000528 | 2521.000000 | 399.0 |
| 14 | max tnr | 0.999588 | 0.999734 | 0.0 |
| 15 | max fnr | 0.999588 | 0.982547 | 0.0 |
| 16 | max fpr | 0.000528 | 1.000000 | 399.0 |
| 17 | max tpr | 0.000528 | 1.000000 | 399.0 |
Gains/Lift Table: Avg response rate: 25.14 %, avg score: 23.33 %
| group | cumulative_data_fraction | lower_threshold | lift | cumulative_lift | response_rate | score | cumulative_response_rate | cumulative_score | capture_rate | cumulative_capture_rate | gain | cumulative_gain | kolmogorov_smirnov | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0.010074 | 9.858663e-01 | 3.898241 | 3.898241 | 0.980198 | 0.996019 | 0.980198 | 0.996019 | 0.039270 | 0.039270 | 289.824091 | 289.824091 | 0.039004 |
| 1 | 2 | 0.020048 | 8.930025e-01 | 3.658834 | 3.779133 | 0.920000 | 0.947266 | 0.950249 | 0.971764 | 0.036493 | 0.075764 | 265.883380 | 277.913290 | 0.074431 |
| 2 | 3 | 0.030022 | 7.687535e-01 | 3.380444 | 3.646678 | 0.850000 | 0.826221 | 0.916944 | 0.923411 | 0.033717 | 0.109480 | 238.044427 | 264.667820 | 0.106149 |
| 3 | 4 | 0.040096 | 6.760741e-01 | 3.346975 | 3.571380 | 0.841584 | 0.716132 | 0.898010 | 0.871333 | 0.033717 | 0.143197 | 234.697452 | 257.137952 | 0.137734 |
| 4 | 5 | 0.050070 | 6.037383e-01 | 2.505506 | 3.359054 | 0.630000 | 0.638585 | 0.844622 | 0.824969 | 0.024990 | 0.168187 | 150.550575 | 235.905407 | 0.157794 |
| 5 | 6 | 0.100040 | 4.497780e-01 | 2.167104 | 2.763673 | 0.544910 | 0.515282 | 0.694915 | 0.670280 | 0.108290 | 0.276478 | 116.710411 | 176.367328 | 0.235705 |
| 6 | 7 | 0.150010 | 3.827897e-01 | 1.714632 | 2.414225 | 0.431138 | 0.412837 | 0.607048 | 0.584523 | 0.085680 | 0.362158 | 71.463182 | 141.422529 | 0.283410 |
| 7 | 8 | 0.200080 | 3.347747e-01 | 1.695372 | 2.234333 | 0.426295 | 0.357233 | 0.561815 | 0.527643 | 0.084887 | 0.447045 | 69.537163 | 123.433270 | 0.329923 |
| 8 | 9 | 0.300020 | 2.687375e-01 | 1.270098 | 1.913135 | 0.319361 | 0.299568 | 0.481051 | 0.451669 | 0.126934 | 0.573979 | 27.009765 | 91.313472 | 0.365984 |
| 9 | 10 | 0.400060 | 2.246552e-01 | 1.015065 | 1.688561 | 0.255234 | 0.246211 | 0.424582 | 0.400292 | 0.101547 | 0.675526 | 1.506508 | 68.856134 | 0.367997 |
| 10 | 11 | 0.500000 | 1.882034e-01 | 0.928759 | 1.536692 | 0.233533 | 0.205571 | 0.386395 | 0.361371 | 0.092820 | 0.768346 | -7.124110 | 53.669179 | 0.358486 |
| 11 | 12 | 0.600040 | 1.569581e-01 | 0.773194 | 1.409400 | 0.194417 | 0.172185 | 0.354388 | 0.329829 | 0.077350 | 0.845696 | -22.680590 | 40.939987 | 0.328175 |
| 12 | 13 | 0.699980 | 1.276519e-01 | 0.555668 | 1.287508 | 0.139721 | 0.141933 | 0.323739 | 0.303003 | 0.055534 | 0.901230 | -44.433228 | 28.750765 | 0.268851 |
| 13 | 14 | 0.800020 | 9.641794e-02 | 0.543218 | 1.194437 | 0.136590 | 0.112420 | 0.300337 | 0.279171 | 0.054344 | 0.955573 | -45.678158 | 19.443670 | 0.207805 |
| 14 | 15 | 0.899960 | 4.969258e-02 | 0.329432 | 1.098378 | 0.082834 | 0.074747 | 0.276183 | 0.256470 | 0.032923 | 0.988497 | -67.056842 | 9.837828 | 0.118277 |
| 15 | 16 | 1.000000 | 6.912780e-07 | 0.114988 | 1.000000 | 0.028913 | 0.024444 | 0.251446 | 0.233258 | 0.011503 | 1.000000 | -88.501216 | 0.000000 | 0.000000 |
Scoring History:
| timestamp | duration | training_speed | epochs | iterations | samples | training_rmse | training_logloss | training_r2 | training_auc | training_pr_auc | training_lift | training_classification_error | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2021-10-15 23:01:21 | 0.000 sec | None | 0.0 | 0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | |
| 1 | 2021-10-15 23:01:22 | 1.679 sec | 56040 obs/sec | 2.0 | 1 | 28020.0 | 0.409698 | 0.520959 | 0.108215 | 0.716311 | 0.514953 | 3.858865 | 0.333333 | |
| 2 | 2021-10-15 23:01:27 | 6.938 sec | 47518 obs/sec | 18.0 | 9 | 252180.0 | 0.390027 | 0.470242 | 0.191796 | 0.762111 | 0.572207 | 3.898241 | 0.272192 | |
| 3 | 2021-10-15 23:01:28 | 7.438 sec | 49628 obs/sec | 20.0 | 10 | 280200.0 | 0.389120 | 0.468283 | 0.195551 | 0.762790 | 0.573885 | 3.898241 | 0.271394 |
Variable Importances:
| variable | relative_importance | scaled_importance | percentage | |
|---|---|---|---|---|
| 0 | min_Speed | 1.000000 | 1.000000 | 0.020915 |
| 1 | max_second | 0.938228 | 0.938228 | 0.019623 |
| 2 | mean_second | 0.804709 | 0.804709 | 0.016830 |
| 3 | mean_Resultant Acceleration | 0.776494 | 0.776494 | 0.016240 |
| 4 | var_Resultant Acceleration | 0.775851 | 0.775851 | 0.016227 |
| 5 | mean_Speed | 0.762504 | 0.762504 | 0.015948 |
| 6 | sum_gyro_y | 0.724917 | 0.724917 | 0.015161 |
| 7 | std_second | 0.721182 | 0.721182 | 0.015083 |
| 8 | sum_Resultant Acceleration | 0.696680 | 0.696680 | 0.014571 |
| 9 | var_velocity | 0.689576 | 0.689576 | 0.014422 |
| 10 | min_second | 0.656991 | 0.656991 | 0.013741 |
| 11 | mean_acceleration_y | 0.653596 | 0.653596 | 0.013670 |
| 12 | std_Bearing | 0.642811 | 0.642811 | 0.013444 |
| 13 | max_Speed | 0.640526 | 0.640526 | 0.013396 |
| 14 | max_acceleration_z | 0.640335 | 0.640335 | 0.013392 |
| 15 | sum_second | 0.630518 | 0.630518 | 0.013187 |
| 16 | std_acceleration_y | 0.605743 | 0.605743 | 0.012669 |
| 17 | min_acceleration_y | 0.598346 | 0.598346 | 0.012514 |
| 18 | std_Resultant Acceleration | 0.597575 | 0.597575 | 0.012498 |
| 19 | var_Resultant Acc & Gyro | 0.583125 | 0.583125 | 0.012196 |
See the whole table with table.as_data_frame()
# Third deep-learning model: same [10, 10] architecture, but trained with
# AUC-based early stopping monitored against a validation frame.
dl_fit3 = H2ODeepLearningEstimator(
    model_id='dl_fit3',
    hidden=[10, 10],
    epochs=20,
    seed=1,
    # Early-stopping configuration: score each interval and stop once AUC
    # fails to improve by at least 0.0005 over 3 consecutive scoring rounds.
    score_interval=1,
    stopping_rounds=3,
    stopping_metric='AUC',
    stopping_tolerance=0.0005,
)
dl_fit3.train(x=x, y=y, training_frame=train, validation_frame=valid)
deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100% Model Details ============= H2ODeepLearningEstimator : Deep Learning Model Key: dl_fit3 Status of Neuron Layers: predicting label, 2-class classification, bernoulli distribution, CrossEntropy loss, 1,112 weights/biases, 62.8 KB, 280,200 training samples, mini-batch size 1
| layer | units | type | dropout | l1 | l2 | mean_rate | rate_rms | momentum | mean_weight | weight_rms | mean_bias | bias_rms | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 97 | Input | 0 | ||||||||||
| 1 | 2 | 10 | Rectifier | 0 | 0 | 0 | 0.00302724 | 0.00294017 | 0 | 0.00610924 | 0.167079 | 0.260987 | 0.194858 | |
| 2 | 3 | 10 | Rectifier | 0 | 0 | 0 | 0.000597243 | 0.00043906 | 0 | 0.0103945 | 0.266963 | 1.00519 | 0.129221 | |
| 3 | 4 | 2 | Softmax | 0 | 0 | 0.000760739 | 0.000423139 | 0 | 0.597493 | 1.57397 | 0.00222188 | 0.0294572 |
ModelMetricsBinomial: deeplearning ** Reported on train data. ** MSE: 0.15603509510545877 RMSE: 0.39501277840781146 LogLoss: 0.4813610633855809 Mean Per-Class Error: 0.31759641397338967 AUC: 0.7590152908770855 AUCPR: 0.5682787151523321 Gini: 0.518030581754171 Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.19454070883948682:
| 0 | 1 | Error | Rate | ||
|---|---|---|---|---|---|
| 0 | 0 | 5610.0 | 1895.0 | 0.2525 | (1895.0/7505.0) |
| 1 | 1 | 968.0 | 1553.0 | 0.384 | (968.0/2521.0) |
| 2 | Total | 6578.0 | 3448.0 | 0.2856 | (2863.0/10026.0) |
Maximum Metrics: Maximum metrics at their respective thresholds
| metric | threshold | value | idx | |
|---|---|---|---|---|
| 0 | max f1 | 0.194541 | 0.520355 | 247.0 |
| 1 | max f2 | 0.113092 | 0.667439 | 317.0 |
| 2 | max f0point5 | 0.302753 | 0.553455 | 180.0 |
| 3 | max accuracy | 0.340743 | 0.789547 | 163.0 |
| 4 | max precision | 0.958701 | 0.977099 | 11.0 |
| 5 | max recall | 0.008958 | 1.000000 | 395.0 |
| 6 | max specificity | 0.999841 | 0.999600 | 0.0 |
| 7 | max absolute_mcc | 0.302753 | 0.364508 | 180.0 |
| 8 | max min_per_class_accuracy | 0.176697 | 0.677112 | 262.0 |
| 9 | max mean_per_class_accuracy | 0.182440 | 0.682404 | 257.0 |
| 10 | max tns | 0.999841 | 7502.000000 | 0.0 |
| 11 | max fns | 0.999841 | 2478.000000 | 0.0 |
| 12 | max fps | 0.000074 | 7505.000000 | 399.0 |
| 13 | max tps | 0.008958 | 2521.000000 | 395.0 |
| 14 | max tnr | 0.999841 | 0.999600 | 0.0 |
| 15 | max fnr | 0.999841 | 0.982943 | 0.0 |
| 16 | max fpr | 0.000074 | 1.000000 | 399.0 |
| 17 | max tpr | 0.008958 | 1.000000 | 395.0 |
Gains/Lift Table: Avg response rate: 25.14 %, avg score: 19.96 %
| group | cumulative_data_fraction | lower_threshold | lift | cumulative_lift | response_rate | score | cumulative_response_rate | cumulative_score | capture_rate | cumulative_capture_rate | gain | cumulative_gain | kolmogorov_smirnov | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0.010074 | 9.825261e-01 | 3.858865 | 3.858865 | 0.970297 | 0.996086 | 0.970297 | 0.996086 | 0.038873 | 0.038873 | 285.886474 | 285.886474 | 0.038474 |
| 1 | 2 | 0.020048 | 8.625644e-01 | 3.738374 | 3.798919 | 0.940000 | 0.931826 | 0.955224 | 0.964116 | 0.037287 | 0.076160 | 273.837366 | 279.891893 | 0.074961 |
| 2 | 3 | 0.030022 | 7.156316e-01 | 3.499754 | 3.699529 | 0.880000 | 0.781637 | 0.930233 | 0.903492 | 0.034907 | 0.111067 | 249.975407 | 269.952861 | 0.108269 |
| 3 | 4 | 0.040096 | 6.090867e-01 | 2.913837 | 3.502128 | 0.732673 | 0.663568 | 0.880597 | 0.843212 | 0.029353 | 0.140420 | 191.383664 | 250.212839 | 0.134025 |
| 4 | 5 | 0.050070 | 5.357200e-01 | 2.465736 | 3.295676 | 0.620000 | 0.573740 | 0.828685 | 0.789533 | 0.024593 | 0.165014 | 146.573582 | 229.567569 | 0.153555 |
| 5 | 6 | 0.100040 | 3.716366e-01 | 2.254423 | 2.775569 | 0.566866 | 0.436216 | 0.697906 | 0.613050 | 0.112654 | 0.277668 | 125.442332 | 177.556857 | 0.237295 |
| 6 | 7 | 0.150010 | 2.994025e-01 | 1.905146 | 2.485621 | 0.479042 | 0.331158 | 0.625000 | 0.519149 | 0.095200 | 0.372868 | 90.514647 | 148.562079 | 0.297718 |
| 7 | 8 | 0.200080 | 2.570957e-01 | 1.394324 | 2.212525 | 0.350598 | 0.276811 | 0.556331 | 0.458504 | 0.069814 | 0.442681 | 39.432433 | 121.252466 | 0.324094 |
| 8 | 9 | 0.300020 | 2.080160e-01 | 1.214531 | 1.880081 | 0.305389 | 0.230403 | 0.472739 | 0.382521 | 0.121380 | 0.564062 | 21.453087 | 88.008125 | 0.352736 |
| 9 | 10 | 0.400060 | 1.780251e-01 | 1.050751 | 1.672697 | 0.264207 | 0.192292 | 0.420593 | 0.334952 | 0.105117 | 0.669179 | 5.075096 | 67.269699 | 0.359519 |
| 10 | 11 | 0.500000 | 1.541572e-01 | 0.897006 | 1.517652 | 0.225549 | 0.166234 | 0.381608 | 0.301228 | 0.089647 | 0.758826 | -10.299354 | 51.765173 | 0.345768 |
| 11 | 12 | 0.600040 | 1.335773e-01 | 0.824740 | 1.402128 | 0.207378 | 0.143757 | 0.352560 | 0.274974 | 0.082507 | 0.841333 | -17.525962 | 40.212811 | 0.322345 |
| 12 | 13 | 0.699980 | 1.154760e-01 | 0.635049 | 1.292608 | 0.159681 | 0.124657 | 0.325021 | 0.253513 | 0.063467 | 0.904800 | -36.495118 | 29.260781 | 0.273620 |
| 13 | 14 | 0.800020 | 9.521998e-02 | 0.515463 | 1.195428 | 0.129611 | 0.105721 | 0.300586 | 0.235032 | 0.051567 | 0.956367 | -48.453726 | 19.542834 | 0.208865 |
| 14 | 15 | 0.899960 | 6.418738e-02 | 0.309586 | 1.097056 | 0.077844 | 0.080844 | 0.275851 | 0.217909 | 0.030940 | 0.987307 | -69.041370 | 9.705599 | 0.116687 |
| 15 | 16 | 1.000000 | 2.701642e-08 | 0.126883 | 1.000000 | 0.031904 | 0.035340 | 0.251446 | 0.199645 | 0.012693 | 1.000000 | -87.311687 | 0.000000 | 0.000000 |
ModelMetricsBinomial: deeplearning ** Reported on validation data. ** MSE: 0.1662288383690995 RMSE: 0.4077117098748814 LogLoss: 0.5273283767277683 Mean Per-Class Error: 0.34300477278678376 AUC: 0.7205467875931347 AUCPR: 0.526376998375618 Gini: 0.44109357518626946 Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.17549606918458813:
| 0 | 1 | Error | Rate | ||
|---|---|---|---|---|---|
| 0 | 0 | 1496.0 | 715.0 | 0.3234 | (715.0/2211.0) |
| 1 | 1 | 277.0 | 483.0 | 0.3645 | (277.0/760.0) |
| 2 | Total | 1773.0 | 1198.0 | 0.3339 | (992.0/2971.0) |
Maximum Metrics: Maximum metrics at their respective thresholds
| metric | threshold | value | idx | |
|---|---|---|---|---|
| 0 | max f1 | 0.175496 | 0.493361 | 242.0 |
| 1 | max f2 | 0.096003 | 0.652857 | 327.0 |
| 2 | max f0point5 | 0.321957 | 0.517094 | 143.0 |
| 3 | max accuracy | 0.359772 | 0.776506 | 127.0 |
| 4 | max precision | 0.999745 | 1.000000 | 0.0 |
| 5 | max recall | 0.000195 | 1.000000 | 399.0 |
| 6 | max specificity | 0.999745 | 1.000000 | 0.0 |
| 7 | max absolute_mcc | 0.321957 | 0.320269 | 143.0 |
| 8 | max min_per_class_accuracy | 0.170168 | 0.648575 | 248.0 |
| 9 | max mean_per_class_accuracy | 0.195814 | 0.656995 | 223.0 |
| 10 | max tns | 0.999745 | 2211.000000 | 0.0 |
| 11 | max fns | 0.999745 | 749.000000 | 0.0 |
| 12 | max fps | 0.000195 | 2211.000000 | 399.0 |
| 13 | max tps | 0.000195 | 760.000000 | 399.0 |
| 14 | max tnr | 0.999745 | 1.000000 | 0.0 |
| 15 | max fnr | 0.999745 | 0.985526 | 0.0 |
| 16 | max fpr | 0.000195 | 1.000000 | 399.0 |
| 17 | max tpr | 0.000195 | 1.000000 | 399.0 |
Gains/Lift Table: Avg response rate: 25.58 %, avg score: 20.02 %
| group | cumulative_data_fraction | lower_threshold | lift | cumulative_lift | response_rate | score | cumulative_response_rate | cumulative_score | capture_rate | cumulative_capture_rate | gain | cumulative_gain | kolmogorov_smirnov | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0.010098 | 9.783734e-01 | 3.909211 | 3.909211 | 1.000000 | 0.995684 | 1.000000 | 0.995684 | 0.039474 | 0.039474 | 290.921053 | 290.921053 | 0.039474 |
| 1 | 2 | 0.020195 | 8.356389e-01 | 2.997061 | 3.453136 | 0.766667 | 0.907631 | 0.883333 | 0.951658 | 0.030263 | 0.069737 | 199.706140 | 245.313596 | 0.066571 |
| 2 | 3 | 0.030293 | 7.518811e-01 | 2.997061 | 3.301111 | 0.766667 | 0.794901 | 0.844444 | 0.899406 | 0.030263 | 0.100000 | 199.706140 | 230.111111 | 0.093668 |
| 3 | 4 | 0.040054 | 6.499898e-01 | 2.830808 | 3.186499 | 0.724138 | 0.696716 | 0.815126 | 0.850011 | 0.027632 | 0.127632 | 183.080762 | 218.649934 | 0.117681 |
| 4 | 5 | 0.050151 | 5.726689e-01 | 2.606140 | 3.069649 | 0.666667 | 0.608545 | 0.785235 | 0.801394 | 0.026316 | 0.153947 | 160.614035 | 206.964853 | 0.139474 |
| 5 | 6 | 0.100303 | 3.738657e-01 | 2.046432 | 2.558040 | 0.523490 | 0.445868 | 0.654362 | 0.623631 | 0.102632 | 0.256579 | 104.643236 | 155.804045 | 0.209994 |
| 6 | 7 | 0.150118 | 2.997642e-01 | 1.664056 | 2.261382 | 0.425676 | 0.333937 | 0.578475 | 0.527499 | 0.082895 | 0.339474 | 66.405583 | 126.138187 | 0.254444 |
| 7 | 8 | 0.200269 | 2.553582e-01 | 1.521706 | 2.076152 | 0.389262 | 0.276938 | 0.531092 | 0.464754 | 0.076316 | 0.415789 | 52.170611 | 107.615215 | 0.289602 |
| 8 | 9 | 0.300236 | 2.044271e-01 | 1.092473 | 1.748627 | 0.279461 | 0.227648 | 0.447309 | 0.385807 | 0.109211 | 0.525000 | 9.247298 | 74.862668 | 0.302024 |
| 9 | 10 | 0.400202 | 1.758377e-01 | 1.066148 | 1.578151 | 0.272727 | 0.189607 | 0.403701 | 0.336798 | 0.106579 | 0.631579 | 6.614833 | 57.815059 | 0.310910 |
| 10 | 11 | 0.500168 | 1.524711e-01 | 0.895038 | 1.441620 | 0.228956 | 0.164364 | 0.368775 | 0.302335 | 0.089474 | 0.721053 | -10.496190 | 44.162003 | 0.296810 |
| 11 | 12 | 0.600135 | 1.335062e-01 | 0.763415 | 1.328649 | 0.195286 | 0.142952 | 0.339877 | 0.275786 | 0.076316 | 0.797368 | -23.658515 | 32.864923 | 0.265030 |
| 12 | 13 | 0.700101 | 1.147068e-01 | 0.710766 | 1.240423 | 0.181818 | 0.124049 | 0.317308 | 0.254120 | 0.071053 | 0.868421 | -28.923445 | 24.042257 | 0.226178 |
| 13 | 14 | 0.800067 | 9.462213e-02 | 0.605467 | 1.161087 | 0.154882 | 0.104515 | 0.297013 | 0.235427 | 0.060526 | 0.928947 | -39.453305 | 16.108651 | 0.173181 |
| 14 | 15 | 0.900034 | 6.763757e-02 | 0.368545 | 1.073059 | 0.094276 | 0.082686 | 0.274495 | 0.218462 | 0.036842 | 0.965789 | -63.145490 | 7.305928 | 0.088358 |
| 15 | 16 | 1.000000 | 1.645909e-11 | 0.342220 | 1.000000 | 0.087542 | 0.035535 | 0.255806 | 0.200175 | 0.034211 | 1.000000 | -65.777955 | 0.000000 | 0.000000 |
Scoring History:
| timestamp | duration | training_speed | epochs | iterations | samples | training_rmse | training_logloss | training_r2 | ... | training_pr_auc | training_lift | training_classification_error | validation_rmse | validation_logloss | validation_r2 | validation_auc | validation_pr_auc | validation_lift | validation_classification_error | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2021-10-15 23:01:30 | 0.000 sec | None | 0.0 | 0 | 0.0 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | |
| 1 | 2021-10-15 23:01:30 | 0.586 sec | 94026 obs/sec | 2.0 | 1 | 28020.0 | 0.406971 | 0.515634 | 0.120047 | ... | 0.511481 | 3.819489 | 0.337522 | 0.409922 | 0.526335 | 0.117317 | 0.710266 | 0.518119 | 3.909211 | 0.365197 | |
| 2 | 2021-10-15 23:01:32 | 2.192 sec | 95198 obs/sec | 12.0 | 6 | 168120.0 | 0.395823 | 0.484846 | 0.167595 | ... | 0.555132 | 3.858865 | 0.323658 | 0.405631 | 0.523267 | 0.135698 | 0.717970 | 0.531195 | 3.909211 | 0.289465 | |
| 3 | 2021-10-15 23:01:33 | 3.769 sec | 95958 obs/sec | 20.0 | 10 | 280200.0 | 0.395013 | 0.481361 | 0.171001 | ... | 0.568279 | 3.858865 | 0.285558 | 0.407712 | 0.527328 | 0.126809 | 0.720547 | 0.526377 | 3.909211 | 0.333894 |
4 rows × 21 columns
Variable Importances:
| variable | relative_importance | scaled_importance | percentage | |
|---|---|---|---|---|
| 0 | min_Speed | 1.000000 | 1.000000 | 0.023783 |
| 1 | mean_second | 0.772823 | 0.772823 | 0.018380 |
| 2 | var_Resultant Acceleration | 0.750026 | 0.750026 | 0.017838 |
| 3 | max_second | 0.694218 | 0.694218 | 0.016511 |
| 4 | max_Speed | 0.670228 | 0.670228 | 0.015940 |
| 5 | std_Bearing | 0.663596 | 0.663596 | 0.015782 |
| 6 | mean_Speed | 0.656259 | 0.656259 | 0.015608 |
| 7 | std_second | 0.623407 | 0.623407 | 0.014826 |
| 8 | min_acceleration_y | 0.600485 | 0.600485 | 0.014281 |
| 9 | mean_Resultant Acceleration | 0.588827 | 0.588827 | 0.014004 |
| 10 | max_Bearing | 0.581170 | 0.581170 | 0.013822 |
| 11 | sum_Resultant Acceleration | 0.580805 | 0.580805 | 0.013813 |
| 12 | std_Accuracy | 0.577396 | 0.577396 | 0.013732 |
| 13 | var_velocity | 0.552825 | 0.552825 | 0.013148 |
| 14 | max_acceleration_z | 0.552549 | 0.552549 | 0.013141 |
| 15 | max_Resultant gyro | 0.544910 | 0.544910 | 0.012960 |
| 16 | sum_second | 0.512927 | 0.512927 | 0.012199 |
| 17 | std_velocity | 0.510603 | 0.510603 | 0.012144 |
| 18 | min_Resultant Acceleration | 0.509642 | 0.509642 | 0.012121 |
| 19 | sum_gyro_y | 0.507941 | 0.507941 | 0.012080 |
See the whole table with table.as_data_frame()
# Score all three deep-learning fits on the held-out test frame.
dl_perf1 = dl_fit1.model_performance(test)
dl_perf2 = dl_fit2.model_performance(test)
dl_perf3 = dl_fit3.model_performance(test)
# Retrieve and print each model's test-set AUC, in fit order.
for dl_perf in (dl_perf1, dl_perf2, dl_perf3):
    print(dl_perf.auc())
0.7052852430689942 0.7285749332680788 0.7234064866673492
dl_fit3.scoring_history()
| timestamp | duration | training_speed | epochs | iterations | samples | training_rmse | training_logloss | training_r2 | ... | training_pr_auc | training_lift | training_classification_error | validation_rmse | validation_logloss | validation_r2 | validation_auc | validation_pr_auc | validation_lift | validation_classification_error | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2021-10-15 23:01:30 | 0.000 sec | None | 0.0 | 0 | 0.0 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | |
| 1 | 2021-10-15 23:01:30 | 0.586 sec | 94026 obs/sec | 2.0 | 1 | 28020.0 | 0.406971 | 0.515634 | 0.120047 | ... | 0.511481 | 3.819489 | 0.337522 | 0.409922 | 0.526335 | 0.117317 | 0.710266 | 0.518119 | 3.909211 | 0.365197 | |
| 2 | 2021-10-15 23:01:32 | 2.192 sec | 95198 obs/sec | 12.0 | 6 | 168120.0 | 0.395823 | 0.484846 | 0.167595 | ... | 0.555132 | 3.858865 | 0.323658 | 0.405631 | 0.523267 | 0.135698 | 0.717970 | 0.531195 | 3.909211 | 0.289465 | |
| 3 | 2021-10-15 23:01:33 | 3.769 sec | 95958 obs/sec | 20.0 | 10 | 280200.0 | 0.395013 | 0.481361 | 0.171001 | ... | 0.568279 | 3.858865 | 0.285558 | 0.407712 | 0.527328 | 0.126809 | 0.720547 | 0.526377 | 3.909211 | 0.333894 |
4 rows × 21 columns
Standard classical feature importance
# Take the impurity-based importances of the first fitted model (`models[0]`),
# keep the 10 largest, and plot them as a horizontal bar chart
# (ascending order so the most important feature ends up on top).
feat_importances = (
    pd.Series(models[0].feature_importances_,
              index=list(X_train.columns.values))
    .nlargest(10)
    .sort_values(ascending=True)
)
fig, ax = plt.subplots()
feat_importances.plot.barh(ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()
Here we use the various SHAP implementations integrated with the models to explain the dataset samples.
import shap

# Load SHAP's JS visualisation runtime so force plots render in the notebook.
shap.initjs()

model = models[0]  # model whose predictions we want to explain
data = X           # dataset to compute SHAP values for

# TreeExplainer computes SHAP values efficiently for tree-based models.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(data)

data.head()
| mean_Accuracy | mean_Bearing | mean_acceleration_x | mean_acceleration_y | mean_acceleration_z | mean_gyro_x | mean_gyro_y | mean_gyro_z | mean_second | mean_Speed | ... | var_gyro_y | var_gyro_z | var_second | var_Speed | var_Resultant Acceleration | var_Resultant gyro | var_Resultant Acc & Gyro | var_harsh_yaw | var_harsh_pitch | var_velocity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10.103896 | 176.344256 | -0.710727 | -9.611757 | -1.618491 | 0.003523 | -0.006026 | -0.004169 | 903.838162 | 9.008759 | ... | 0.010060 | 0.004056 | 284746.821782 | 51.860896 | 0.384068 | 0.008376 | 0.881207 | 0.835091 | 0.624860 | 0.524973 |
| 1 | 3.962085 | 125.563518 | -0.502344 | 9.532651 | -2.167925 | -0.003029 | -0.007780 | 0.000667 | 604.198697 | 10.945002 | ... | 0.011037 | 0.001494 | 92143.056703 | 35.329667 | 0.350473 | 0.007079 | 0.737030 | 0.186349 | 0.213582 | 0.625931 |
| 2 | 4.326701 | 199.992537 | 0.425984 | 9.853977 | 0.181858 | 0.009894 | 0.002275 | 0.003592 | 331.694030 | 4.594452 | ... | 0.015525 | 0.001851 | 134012.845528 | 5.595322 | 0.379097 | 0.008711 | 0.868908 | 0.151731 | 0.077450 | 0.954959 |
| 3 | 10.000000 | 153.848696 | -0.362334 | -9.405008 | -2.618961 | -0.022986 | 0.022829 | -0.000490 | 535.007619 | 6.408752 | ... | 0.013147 | 0.004483 | 100132.100991 | 30.974094 | 0.400138 | 0.008221 | 0.846020 | 0.366474 | 0.658357 | 0.580653 |
| 4 | 4.754031 | 207.623656 | 0.513989 | 9.521987 | 2.383745 | 0.006200 | -0.000739 | 0.003029 | 600.821813 | 7.785974 | ... | 0.017002 | 0.005226 | 109509.715893 | 22.924285 | 0.498934 | 0.011541 | 1.183742 | 0.587364 | 0.590992 | 0.668006 |
5 rows × 96 columns
# Force plot for one trip the model predicts as safe driving.
# Index 1 in expected_value/shap_values presumably selects the positive
# ("dangerous") class of the binary model — confirm against class ordering.
instance_to_explain = 488
shap.force_plot(explainer.expected_value[1], shap_values[1][instance_to_explain], data.iloc[instance_to_explain])
# Force plot for one trip the model predicts as dangerous driving.
instance_to_explain = 223
shap.force_plot(explainer.expected_value[1], shap_values[1][instance_to_explain], data.iloc[instance_to_explain])
To keep the browser happy we only visualize 1,000 individuals.
shap.force_plot(explainer.expected_value[1], shap_values[1][:1000,:], data.iloc[:1000,:])
# Recompute SHAP values and show a bar chart of mean |SHAP| per feature.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(data)
shap.summary_plot(shap_values, data, plot_type="bar")
Rather than use a typical feature importance bar chart, we use a density scatter plot of SHAP values for each feature to identify how much impact each feature has on the model output for individuals in the validation dataset. Features are sorted by the sum of the SHAP value magnitudes across all samples.
Trip duration - It is interesting to note that longer rides are associated with dangerous driving. This could possibly be explained by driver fatigue, adverse driving conditions, etc.
Speeding - We could infer that the longer the ride, the higher the chance of a dangerous encounter. A higher maximum speed may indicate dangerous drives.
Acceleration - It is interesting to note that greater acceleration can indicate a more dangerous driving style.
Accuracy - Low accuracy may not necessarily relate to dangerous driving. It could indicate that drivers take certain higher-risk manoeuvres to meet service-quality times. This could be used as a dependent variable in other use cases to improve customer satisfaction.
# Density (beeswarm) plot input: fit a standalone XGBoost classifier and
# build a SHAP explainer for it, using X_train as the background dataset.
import xgboost

model_test = xgboost.XGBClassifier()
model_test.fit(X_train, y_train)  # fit returns self, so model_test stays bound to the estimator

# Compute SHAP values for every training row.
explainer = shap.Explainer(model_test, X_train)
shap_values = explainer(X_train)
[23:01:54] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
100%|===================| 13894/13949 [01:30<00:00]
# Beeswarm of the 10 most impactful features (signed SHAP values).
shap.plots.beeswarm(shap_values, max_display=10)
# Same plot on absolute SHAP values, drawn in SHAP's red colour scheme.
shap.plots.beeswarm(shap_values.abs, color="shap_red" ,max_display=10)
# Bar chart of mean |SHAP| per feature (global importance).
shap.plots.bar(shap_values.abs.mean(0) ,max_display=10)
SHAP dependence plots show the effect of a single feature across the whole dataset. They plot a feature’s value vs. the SHAP value of that feature across many samples. SHAP dependence plots are similar to partial dependence plots, but account for the interaction effects present in the features, and are only defined in regions of the input space supported by data. The vertical dispersion of SHAP values at a single feature value is driven by interaction effects, and another feature is chosen for coloring to highlight possible interactions.
# One dependence plot per feature: feature value vs. that feature's SHAP
# value; SHAP auto-picks a second feature for colouring to surface interactions.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)
for feature_name in data.columns.tolist():
    # shap_values[0] is the per-class explanation at index 0 — presumably
    # one class of the binary model; confirm against the model's class order.
    shap.dependence_plot(feature_name, shap_values[0], X)
# Report total wall-clock runtime (start_time is captured at the top of the notebook).
end_time = datetime.now()
print(f'Duration: {end_time - start_time}')